import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# SZTU homepage scraper: for every featured news entry ("focus-right-item"),
# save the article text to news/<title>/<title>.txt and its images as
# news/<title>/1.jpg, 2.jpg, ...
url = "https://www.sztu.edu.cn/"

# Fetch the homepage; timeout prevents hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
response.raise_for_status()
# The site serves UTF-8; set it explicitly so .text decodes correctly.
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'html.parser')

# Each featured news entry lives in a div with class "focus-right-item".
items = soup.find_all('div', class_="focus-right-item")
for item in items:
    # Anchor holding the link to the article page; skip malformed entries
    # instead of crashing with AttributeError/KeyError.
    page = item.find('a')
    if page is None or 'href' not in page.attrs:
        continue
    page_src = page.attrs['href']

    # News title (used as directory and file name).
    title_div = page.find('div', class_="title-2 font18 fwb900 clamp1")
    if title_div is None:
        continue
    page_name = title_div.get_text().strip()
    # Strip characters that are illegal in file names (and '/' which would
    # escape the target directory); fall back for an empty title.
    page_name = re.sub(r'[\\/:*?"<>|]', '_', page_name) or 'untitled'

    # urljoin handles relative, root-relative, and absolute hrefs correctly,
    # unlike plain string concatenation.
    r = requests.get(urljoin(url, page_src), timeout=30)
    r.encoding = 'utf-8'
    soup_page = BeautifulSoup(r.text, 'html.parser')
    # Article body container(s).
    txts = soup_page.find_all('div', class_="v_news_content")

    # makedirs also creates the 'news' parent (os.mkdir would raise
    # FileNotFoundError if it is missing); exist_ok avoids a racy pre-check.
    news_dir = os.path.join('news', page_name)
    os.makedirs(news_dir, exist_ok=True)

    # Write the article text: each paragraph is a <span> inside the body div.
    with open(os.path.join(news_dir, page_name + '.txt'), 'w', encoding='utf-8') as f:
        for txt in txts:
            for span in txt.find_all('span'):
                f.write(span.get_text())

    # Download embedded images, numbered sequentially per article.
    count = 1
    for txt in txts:
        for img in txt.find_all('img'):
            # Some <img> tags may lack src; skip them.
            img_src = img.attrs.get('src')
            if not img_src:
                continue
            img_r = requests.get(urljoin(url, img_src), timeout=30)
            with open(os.path.join(news_dir, str(count) + '.jpg'), 'wb') as f:
                f.write(img_r.content)
            count += 1